#!/bin/bash

# main script that orchestrates the crawling process
#


# 1. rotate log (last 6) and data files (last 3)
# 2. launch crawler
# 3. check results; if fewer than the threshold number of items were scraped, send an email (from crawler@hmss.com to peter.dermek@20deka.sk)
# 4. if ok, write the scraped items to the db

# options -l log dir  (defaults to /var/log/hmss)
# 		 -d data dir (defaults to /var/data/hmss)

# Default directories; both can be overridden on the command line
# (-l for the log dir, -d for the data dir).
LOG_DIR=/var/log/hmss
DATA_DIR=/var/data/hmss

# Retention limits passed to rotate_files for each file family.
MAX_NUM_LOG_FILES=6
MAX_NUM_WRITER_FILES=3
MAX_NUM_DATA_FILES=3
# A crawl that yields fewer items than this is treated as a failure.
MIN_NUM_SCRAPED_ITEMS=100

# Print a short usage summary to stdout.
usage() {
	echo "hide my ass crawler and db writer"
	# ${0##*/} strips the directory part of $0 without spawning basename.
	echo "${0##*/} [-l <log dir>] [-d <data dir>]"
	echo "log dir - defaults to /var/log/hmss"
	echo "data dir - defaults to /var/data/hmss"
}

# Rotate numbered copies of a file: <prefix> -> <prefix>.1 -> <prefix>.2 ...
# When more than the allowed number of files already exists, the
# highest-numbered (oldest) copy is deleted before rotating.
# Arguments:
#   $1 - directory containing the files
#   $2 - base file name (rotation prefix)
#   $3 - maximum number of files to keep
rotate_files() {
	local file_dir=$1
	local file_prefix=$2
	local max_num_files=$3
	local num_files=0
	local f cur_suffix

	# Count existing files by globbing instead of parsing `ls` output,
	# which is unreliable for unusual file names (SC2012).
	for f in "$file_dir/$file_prefix"*; do
		[ -e "$f" ] && num_files=$((num_files + 1))
	done

	# Over the limit: drop the oldest copy so the shift below stays in bounds.
	if [ "$num_files" -gt "$max_num_files" ]; then
		num_files=$((num_files - 1))
		rm -- "$file_dir/$file_prefix.$num_files"
	fi

	# Shift every remaining file one suffix up, highest suffix first,
	# finishing by moving the un-suffixed base file to .1.
	while [ "$num_files" -gt 0 ]; do
		cur_suffix=$((num_files - 1))
		if [ "$cur_suffix" -gt 0 ]; then
			mv -- "$file_dir/$file_prefix.$cur_suffix" "$file_dir/$file_prefix.$((cur_suffix + 1))"
		else
			mv -- "$file_dir/$file_prefix" "$file_dir/$file_prefix.1"
		fi
		num_files=$cur_suffix
	done
}

# Parse command-line options: -l <log dir>, -d <data dir>.
while getopts l:d: FLAG; do
	case "$FLAG" in
		l)
			LOG_DIR=$OPTARG
			;;
		d)
			DATA_DIR=$OPTARG
			;;
		\?)
			# Unknown option: show usage and abort instead of continuing
			# with half-parsed arguments.
			usage
			exit 2
			;;
	esac
done

shift $((OPTIND - 1))

#
# rotate files
#

# Rotate each file family only when the current (un-suffixed) file exists.
# All expansions are quoted so paths containing spaces cannot word-split.

# rotate crawler log files
test -f "$LOG_DIR/crawler.log" && rotate_files "$LOG_DIR" crawler.log "$MAX_NUM_LOG_FILES"

# rotate writer log files
test -f "$LOG_DIR/writer.log" && rotate_files "$LOG_DIR" writer.log "$MAX_NUM_WRITER_FILES"

# rotate data files
test -f "$DATA_DIR/hmss.jsonlines" && rotate_files "$DATA_DIR" hmss.jsonlines "$MAX_NUM_DATA_FILES"

#
# launch crawler
#

# Abort if the crawler directory is missing; running scrapy from the
# wrong cwd would silently produce an empty crawl.
cd "$HMSS_HOME_DIR/crawler" || exit 1

# Run the crawler. stderr is APPENDED (2>>) to the log rather than
# truncated (2>): the original `2>` opened the same file scrapy writes
# via --logfile and clobbered its contents.
DJANGO_SETTINGS_MODULE=hmss_settings PYTHONPATH="$HMSS_HOME_DIR:$PYTHONPATH" scrapy crawl hidemyass --logfile "$LOG_DIR/crawler.log" -o "$DATA_DIR/hmss.jsonlines" 2>>"$LOG_DIR/crawler.log"

cd "$HMSS_HOME_DIR" || exit 1

# Extract N from the "item_scraped_count: N," line of the scrapy stats.
NUM_SCRAPED_ITEMS=$(grep item_scraped_count "$LOG_DIR/crawler.log" | cut -f2 -d: | tr -d ' ,')

# Default to 0 when the stat line is absent (e.g. the crawler crashed
# before dumping stats); an empty operand would be a syntax error in
# the numeric comparison.
if [ "${NUM_SCRAPED_ITEMS:-0}" -lt "$MIN_NUM_SCRAPED_ITEMS" ]
then
	# Mail the scrapy stats section to the maintainer, then fail.
	grep -i -A100 "dumping scrapy stats" "$LOG_DIR/crawler.log" > "$LOG_DIR/hmss.stats"
	python2.7 ./mailer.py --from_addr hmss@20deka.com --to_addr peter.dermek@20deka.com --subject "Failed to crawl hmss." "$LOG_DIR/hmss.stats"
	rm "$LOG_DIR/hmss.stats"
	exit 1
fi

#
# write scraped items to db
#

# Send stdout and stderr to writer.log via a single shared descriptor
# (2>&1). The original `> f 2>f` opened the file twice, so the two
# streams overwrote each other's output.
DJANGO_SETTINGS_MODULE="hmss_settings" PYTHONPATH="$HMSS_HOME_DIR:$PYTHONPATH" python2.7 ippool/writer.py -v "$DATA_DIR/hmss.jsonlines" > "$LOG_DIR/writer.log" 2>&1
